library(tidyverse)
library(data.table)
library(caret)
library(ggthemes)
library(lubridate)
# Load the raw competition files ---------------------------------------------
trainperf         <- fread("trainperf.csv")
traindemographics <- fread("traindemographics.csv")
testperf          <- fread("testperf.csv")
testdemographics  <- fread("testdemographics.csv")
SampleSubmission  <- fread("SampleSubmission.csv")

# Keep every performance row even when demographics are missing (all.y = TRUE).
# FIX: spell out TRUE instead of the reassignable shorthand T.
train_data <- merge(traindemographics, trainperf, all.y = TRUE, by = "customerid")
test_data  <- merge(testdemographics, testperf, all.y = TRUE, by = "customerid")

# Stack train and test so feature engineering is applied once to both sets;
# the "set" column lets us split them apart again later.
loan_data <- rbind(train_data[, set := "train"], test_data[, set := "test"], fill = TRUE)

# Parse the date columns, then derive simple calendar features from them.
dates <- c("birthdate", "approveddate", "creationdate")
loan_data[, (dates) := lapply(.SD, as.Date), .SDcols = dates]
loan_data[, age := (as.numeric(approveddate - birthdate)) / 365]  # approximate age in years
loan_data[, aprove_month := month(approveddate)]  # NOTE: name kept as-is; referenced later by this spelling
loan_data[, approve_day := wday(approveddate)]
loan_data[, approve_year := year(approveddate)]
# Class balance: percentage of Good vs Bad among the labelled (train) rows.
loan_data[!is.na(good_bad_flag), .N, by = .(good_bad_flag)] %>%
    .[, perc := round(N/sum(N) * 100, 2)] %>%
    ggplot(aes(x = good_bad_flag, y = perc, fill = good_bad_flag)) +
    geom_bar(stat = "identity") +
    geom_text(aes(good_bad_flag, perc,
                  label = paste(perc, "%"),
                  vjust = .05,
                  hjust = .5),
              size = 4) +
    theme_hc() +
    labs(title = "Percentage of bad loans") +
    scale_fill_colorblind(name = "") +
    theme(legend.position = "none")

# Character columns where an empty or single-space string really means
# "missing"; recode those blanks as NA so they count in the NA summary below.
chars <- c("bank_account_type", "bank_name_clients",
           "bank_branch_clients", "employment_status_clients",
           "level_of_education_clients")

loan_data[, (chars) := lapply(.SD, function(col) {
    col[!is.na(col) & (col == "" | col == " ")] <- NA
    col
}), .SDcols = chars]
# Percentage of missing values per column; keep only columns with any NA.
naVals <- colSums(is.na(loan_data)) / nrow(loan_data) * 100

withNa <- naVals[naVals > 0]
nms_na <- names(withNa)
missing_perc <- data.table(variables = nms_na, perc = withNa)

# Horizontal bar chart of missingness, ordered from least to most missing.
ggplot(missing_perc, aes(x = reorder(variables, perc), y = perc)) +
    geom_bar(stat = "identity") +
    theme_fivethirtyeight() +
    coord_flip()

# Make loannumber numeric so it is treated as a continuous feature.
loan_data[, loannumber := as.numeric(loannumber)]

# Columns that are more than half missing (candidates for dropping).
missing_var_del <- missing_perc[perc > 50, variables]

## KNN imputation
library(VIM)
# Raw date columns were already turned into derived features; drop them,
# plus the heavily-missing referral column, before imputing.
loan_data[, (dates) := NULL]
loan_data[, referredby := NULL]
loan_data <- kNN(loan_data, useImputedDist = FALSE, k = 10)

setDT(loan_data)  # VIM::kNN() returns a data.frame; restore data.table semantics

nms_all <- names(loan_data)
nms_imp <- nms_all[grepl("_imp$", nms_all)]  # kNN's logical "was imputed" flag columns

# Convert the logical imputation flags to numeric 0/1 so they can act as
# features (as.numeric(NA) stays NA, matching the original ifelse behavior).
loan_data[, (nms_imp) := lapply(.SD, as.numeric), .SDcols = nms_imp]

# Drop flag columns where nothing was ever imputed (all zeros, no signal).
col_sum_imp <- loan_data[, colSums(.SD), .SDcols = nms_imp]
col_sum_imp <- names(col_sum_imp[col_sum_imp == 0])
#var_importants <- fread("var_importanta.csv")
loan_data[, (col_sum_imp) := NULL]

# Fix the target's factor levels ("Bad" first) for caret's twoClassSummary.
loan_data[, good_bad_flag := factor(good_bad_flag, levels = c("Bad", "Good"))]

# Drop helper / leakage columns before modelling.
# BUG FIX: the original listed " good_bad_flag_imp" with a leading space, so
# that column was never matched; intersect() also guards against names that
# do not exist in the table (e.g. "year"), avoiding data.table warnings.
nms_del1 <- c("set_imp", "good_bad_flag_imp",
              "approve_year", "aprove_month",
              "year", "systemloanid")
nms_del1 <- intersect(nms_del1, names(loan_data))

loan_data[, (nms_del1) := NULL]

# Names of numeric feature columns, excluding imputation flags and the target.
class_nms <- sapply(loan_data, class)
nums <- names(class_nms)[class_nms == "numeric"]
nums <- grep("_imp|good_bad_flag", nums, value = TRUE, invert = TRUE)

# Min-max scale a numeric vector into [0, 1].
#
# Robustness fixes over the original:
#  - NAs are ignored when computing the range (previously a single NA made
#    the entire scaled column NA); NA positions still stay NA elementwise.
#  - A constant vector (max == min) returns all zeros instead of NaN from
#    division by zero, which would otherwise break downstream model fitting.
zero_one <- function(x) {
    rng <- range(x, na.rm = TRUE)
    span <- rng[2] - rng[1]
    if (span == 0) {
        return(rep(0, length(x)))
    }
    (x - rng[1]) / span
}


# Scale every numeric feature column to [0, 1] in place.
for (col in nums) set(loan_data, j = col, value = zero_one(loan_data[[col]]))


# Split back into train / test, then undersample the majority (Good) class
# so the training set used for modelling is balanced.
train_data <- loan_data[set == "train"]
train_data[, set := NULL]
test_data <- loan_data[set == "test"]
test_data[, set := NULL]

train_bad  <- train_data[good_bad_flag == "Bad"]
train_good <- train_data[good_bad_flag == "Good"]
# FIX: use <- for assignment and descriptive names (was n_row / n_row_dead).
n_good <- nrow(train_good)
n_bad  <- nrow(train_bad)

# Draw as many Good rows as there are Bad rows (random undersampling);
# sample(n, k) draws from 1:n, identical RNG stream to sample(1:n, k).
set.seed(200)
keep_idx <- sample(n_good, n_bad)
train_good <- train_good[keep_idx]
train_sampled <- rbind(train_bad, train_good)
## Model Cross validation

set.seed(100)
# BUG FIX: createFolds() returns the *held-out* indices by default, but
# trainControl(index = ...) expects the *training* indices of each fold.
# Without returnTrain = TRUE each model was trained on ~10% of the data and
# evaluated on the remaining 90% (visible as ~190-row "sample sizes" in the
# printed output below). returnTrain = TRUE restores standard 10-fold CV.
cv_fold <- createFolds(train_sampled$good_bad_flag, k = 10, returnTrain = TRUE)

train_ctrl <- trainControl(method = "cv",
                           number = 10,
                           summaryFunction = twoClassSummary,  # ROC/Sens/Spec
                           classProbs = TRUE,                  # needed for ROC
                           allowParallel = TRUE,               # FIX: TRUE, not T
                           index = cv_fold,
                           verboseIter = FALSE,
                           savePredictions = TRUE,
                           search = "grid")


# xgbTree tuning grid: 2 x 2 x 2 x 3 x 3 = 72 combinations
# (eta and min_child_weight held constant).
xgb_grid <- expand.grid(
    nrounds          = c(50, 100),
    eta              = 0.4,
    max_depth        = c(2, 3),
    gamma            = c(0, .01),
    colsample_bytree = c(0.6, .8, 1),
    min_child_weight = 1,
    subsample        = c(.5, .8, 1)
)

 
# ranger tuning grid: both split rules, three mtry values (the largest uses
# all predictors minus the id and target columns), two minimum leaf sizes.
ranger_grid <- expand.grid(
    splitrule     = c("extratrees", "gini"),
    mtry          = c(10, 20, (ncol(train_data) - 2)),
    min.node.size = c(1, 5)
)

# svmRadial tuning grid: 4 cost values crossed with 7 kernel widths.
svm_grid <- expand.grid(
    C     = c(1, 3, 5, 20),
    sigma = seq(0.001, 0.524, length.out = 7)
)
library(caret)
library(caretEnsemble)
library(tictoc)

tic()

# Fit all three models with shared CV folds so their resamples are comparable.
# Order matters: model_list keeps the models in this order.
model_specs <- list(
    caretModelSpec(method = "xgbTree",   tuneGrid = xgb_grid),
    caretModelSpec(method = "svmRadial", tuneGrid = svm_grid),
    caretModelSpec(method = "ranger",    tuneGrid = ranger_grid)
)

model_list <- caretList(
    good_bad_flag ~ .,
    data      = train_sampled[, .SD, .SDcols = !"customerid"],  # id is not a feature
    metric    = "ROC",
    trControl = train_ctrl,
    tuneList  = model_specs
)

toc()
## 165.1 sec elapsed
model_list
## $xgbTree
## eXtreme Gradient Boosting 
## 
## 1906 samples
##   25 predictor
##    2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  gamma  colsample_bytree  subsample  nrounds  ROC        Sens     
##   2          0.00   0.6               0.5         50      0.6675440  0.6278507
##   2          0.00   0.6               0.5        100      0.6597599  0.6304177
##   2          0.00   0.6               0.8         50      0.7217087  0.6621213
##   2          0.00   0.6               0.8        100      0.7043960  0.6627110
##   2          0.00   0.6               1.0         50      0.7397850  0.6685301
##   2          0.00   0.6               1.0        100      0.7215911  0.6707503
##   2          0.00   0.8               0.5         50      0.6585361  0.6123478
##   2          0.00   0.8               0.5        100      0.6525907  0.6214439
##   2          0.00   0.8               0.8         50      0.7186698  0.6599123
##   2          0.00   0.8               0.8        100      0.6986109  0.6564184
##   2          0.00   0.8               1.0         50      0.7408503  0.6595522
##   2          0.00   0.8               1.0        100      0.7235121  0.6673740
##   2          0.00   1.0               0.5         50      0.6664863  0.6193329
##   2          0.00   1.0               0.5        100      0.6570109  0.6271548
##   2          0.00   1.0               0.8         50      0.7107864  0.6501159
##   2          0.00   1.0               0.8        100      0.6971123  0.6602676
##   2          0.00   1.0               1.0         50      0.7367861  0.6664325
##   2          0.00   1.0               1.0        100      0.7189294  0.6642244
##   2          0.01   0.6               0.5         50      0.6587045  0.6265695
##   2          0.01   0.6               0.5        100      0.6575760  0.6348562
##   2          0.01   0.6               0.8         50      0.7198368  0.6625938
##   2          0.01   0.6               0.8        100      0.7049304  0.6590954
##   2          0.01   0.6               1.0         50      0.7421705  0.6600194
##   2          0.01   0.6               1.0        100      0.7245672  0.6678356
##   2          0.01   0.8               0.5         50      0.6594476  0.6091857
##   2          0.01   0.8               0.5        100      0.6566138  0.6249385
##   2          0.01   0.8               0.8         50      0.7179802  0.6610740
##   2          0.01   0.8               0.8        100      0.7015752  0.6516354
##   2          0.01   0.8               1.0         50      0.7413740  0.6526739
##   2          0.01   0.8               1.0        100      0.7221938  0.6643373
##   2          0.01   1.0               0.5         50      0.6542567  0.6133859
##   2          0.01   1.0               0.5        100      0.6567132  0.6263353
##   2          0.01   1.0               0.8         50      0.7143992  0.6636411
##   2          0.01   1.0               0.8        100      0.6994088  0.6540859
##   2          0.01   1.0               1.0         50      0.7367861  0.6664325
##   2          0.01   1.0               1.0        100      0.7178856  0.6650403
##   3          0.00   0.6               0.5         50      0.6521986  0.6110577
##   3          0.00   0.6               0.5        100      0.6516902  0.6220238
##   3          0.00   0.6               0.8         50      0.7066189  0.6566458
##   3          0.00   0.6               0.8        100      0.6907217  0.6482569
##   3          0.00   0.6               1.0         50      0.7241909  0.6659743
##   3          0.00   0.6               1.0        100      0.7048806  0.6622484
##   3          0.00   0.8               0.5         50      0.6533295  0.6257437
##   3          0.00   0.8               0.5        100      0.6496811  0.6112926
##   3          0.00   0.8               0.8         50      0.7000046  0.6522196
##   3          0.00   0.8               0.8        100      0.6867139  0.6448741
##   3          0.00   0.8               1.0         50      0.7187715  0.6624766
##   3          0.00   0.8               1.0        100      0.7027482  0.6592190
##   3          0.00   1.0               0.5         50      0.6520713  0.6184046
##   3          0.00   1.0               0.5        100      0.6429416  0.6128115
##   3          0.00   1.0               0.8         50      0.6927271  0.6428846
##   3          0.00   1.0               0.8        100      0.6814026  0.6448708
##   3          0.00   1.0               1.0         50      0.7183425  0.6635239
##   3          0.00   1.0               1.0        100      0.7000319  0.6558334
##   3          0.01   0.6               0.5         50      0.6635055  0.6273885
##   3          0.01   0.6               0.5        100      0.6530193  0.6209713
##   3          0.01   0.6               0.8         50      0.6986443  0.6549018
##   3          0.01   0.6               0.8        100      0.6855622  0.6474383
##   3          0.01   0.6               1.0         50      0.7224444  0.6648053
##   3          0.01   0.6               1.0        100      0.7060696  0.6599160
##   3          0.01   0.8               0.5         50      0.6599489  0.6207412
##   3          0.01   0.8               0.5        100      0.6518656  0.6219056
##   3          0.01   0.8               0.8         50      0.6984896  0.6516337
##   3          0.01   0.8               0.8        100      0.6849219  0.6423148
##   3          0.01   0.8               1.0         50      0.7194476  0.6632932
##   3          0.01   0.8               1.0        100      0.7032396  0.6542018
##   3          0.01   1.0               0.5         50      0.6571391  0.6179347
##   3          0.01   1.0               0.5        100      0.6494040  0.6186328
##   3          0.01   1.0               0.8         50      0.6957470  0.6474355
##   3          0.01   1.0               0.8        100      0.6831119  0.6391576
##   3          0.01   1.0               1.0         50      0.7173461  0.6650392
##   3          0.01   1.0               1.0        100      0.6990038  0.6590969
##   Spec     
##   0.6188807
##   0.6012625
##   0.6606161
##   0.6384609
##   0.6879020
##   0.6516350
##   0.6129367
##   0.5996342
##   0.6646959
##   0.6300672
##   0.6967615
##   0.6592170
##   0.6160751
##   0.5952051
##   0.6516380
##   0.6188798
##   0.6845206
##   0.6532707
##   0.5998692
##   0.5961439
##   0.6601507
##   0.6428915
##   0.6952461
##   0.6611943
##   0.6140933
##   0.5989399
##   0.6538544
##   0.6426639
##   0.7032929
##   0.6570017
##   0.6124558
##   0.5922880
##   0.6507087
##   0.6384675
##   0.6845206
##   0.6481425
##   0.6037164
##   0.5903130
##   0.6449907
##   0.6258717
##   0.6616637
##   0.6388155
##   0.5921684
##   0.5964857
##   0.6354301
##   0.6236545
##   0.6512880
##   0.6343834
##   0.5998734
##   0.5877549
##   0.6314659
##   0.6157237
##   0.6584013
##   0.6319372
##   0.6109569
##   0.5966067
##   0.6305436
##   0.6178262
##   0.6574691
##   0.6367166
##   0.6029056
##   0.5904248
##   0.6322864
##   0.6168991
##   0.6573545
##   0.6389298
##   0.6046464
##   0.5959014
##   0.6361284
##   0.6238861
##   0.6559509
##   0.6291390
## 
## Tuning parameter 'eta' was held constant at a value of 0.4
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 2, eta
##  = 0.4, gamma = 0.01, colsample_bytree = 0.6, min_child_weight = 1
##  and subsample = 1.
## 
## $svmRadial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 1906 samples
##   25 predictor
##    2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ... 
## Resampling results across tuning parameters:
## 
##   C   sigma       ROC        Sens       Spec     
##    1  0.00100000  0.4412492  0.6618623  0.3382454
##    1  0.08816667  0.7063337  0.6683058  0.6214489
##    1  0.17533333  0.7256582  0.6664370  0.6602756
##    1  0.26250000  0.6919439  0.6315584  0.6479259
##    1  0.34966667  0.7220906  0.6364704  0.6830113
##    1  0.43683333  0.7185227  0.6428808  0.6727497
##    1  0.52400000  0.7147828  0.6288937  0.6889579
##    3  0.00100000  0.5163466  0.4034898  0.6034076
##    3  0.08816667  0.7503086  0.6594411  0.7144810
##    3  0.17533333  0.7423805  0.6590902  0.7084203
##    3  0.26250000  0.7341790  0.6597951  0.6972255
##    3  0.34966667  0.7283172  0.6511635  0.6996721
##    3  0.43683333  0.7235423  0.6539646  0.6892970
##    3  0.52400000  0.7193765  0.6398573  0.6978099
##    5  0.00100000  0.5098943  0.3711403  0.6181310
##    5  0.08816667  0.7518890  0.6658526  0.7127298
##    5  0.17533333  0.7392287  0.6663227  0.6972262
##    5  0.26250000  0.7330108  0.6696994  0.6894161
##    5  0.34966667  0.7280544  0.6606077  0.6890720
##    5  0.43683333  0.7234473  0.6518581  0.6866204
##    5  0.52400000  0.7191750  0.6540742  0.6824247
##   20  0.00100000  0.6383193  0.5461143  0.6220235
##   20  0.08816667  0.7403581  0.6672478  0.6976935
##   20  0.17533333  0.7356718  0.6714449  0.6840560
##   20  0.26250000  0.7299851  0.6692351  0.6732156
##   20  0.34966667  0.7241865  0.6602530  0.6715803
##   20  0.43683333  0.7198412  0.6669151  0.6552680
##   20  0.52400000  0.7157682  0.6442871  0.6783449
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.08816667 and C = 5.
## 
## $ranger
## Random Forest 
## 
## 1906 samples
##   25 predictor
##    2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ... 
## Resampling results across tuning parameters:
## 
##   splitrule   mtry  min.node.size  ROC        Sens       Spec     
##   extratrees  10    1              0.8049571  0.6909181  0.7667202
##   extratrees  10    5              0.8047689  0.6852059  0.7756981
##   extratrees  20    1              0.8057977  0.6880028  0.7720858
##   extratrees  20    5              0.8060334  0.6829887  0.7811791
##   extratrees  25    1              0.8052493  0.6875334  0.7762797
##   extratrees  25    5              0.8045445  0.6806559  0.7821136
##   gini        10    1              0.7769078  0.6985025  0.7100580
##   gini        10    5              0.7781485  0.6940717  0.7176355
##   gini        20    1              0.7618497  0.6818278  0.7045735
##   gini        20    5              0.7651843  0.6825285  0.7049258
##   gini        25    1              0.7603744  0.6791466  0.7015445
##   gini        25    5              0.7591756  0.6797294  0.7008435
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
##  and min.node.size = 5.
## 
## attr(,"class")
## [1] "caretList"
# Compare resampled sensitivity across the three fitted models.
resamples_models <- resamples(model_list)
dotplot(resamples_models, metric = "Sens")

# Model Statistics ----

# For each model, sweep the probability threshold over the CV predictions and
# plot F1 / sensitivity so a decision cutoff can be chosen.
nms_models <- names(model_list)
resamples_stat_list <- vector("list", length(nms_models))  # preallocate results
for (j in seq_along(nms_models)) {  # FIX: seq_along, not 1:length (safe when empty)
  model1 <- model_list[[j]]
  resample_stata <- thresholder(model1,
                                threshold = seq(0, 1, by = 0.01),
                                final = TRUE,
                                statistics = "all") %>% setDT()
  p <- ggplot(resample_stata, aes(x = prob_threshold, y = F1, col = "F1")) +
    geom_point() +
    geom_point(aes(y = Sensitivity, col = "Sensitivity")) +
    scale_x_continuous(breaks = seq(0, 1, by = .1)) +
    ggtitle(nms_models[j])
  print(p)  # explicit print: ggplot objects do not auto-print inside loops
  resample_stata[, model := nms_models[j]]  # tag rows with the model name
  resamples_stat_list[[j]] <- resample_stata
}

# ROC Curve ----

# Combine the per-model threshold sweeps and draw interactive ROC curves.
resamples_combined <- rbindlist(resamples_stat_list, fill = TRUE)
library(plotly)

roc_plot <- ggplot(resamples_combined,
                   aes(x = 1 - Specificity, y = Recall, color = model)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(0, 1, by = .1)) +
  ggtitle(paste0("ROC for models")) +
  scale_color_viridis_d()
ggplotly(roc_plot)

# Precision-Recall Curve ----

# Interactive precision-recall curves for the three models.
pr_plot <- ggplot(resamples_combined,
                  aes(x = Recall, y = Precision, color = model)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(0, 1, by = .1)) +
  scale_color_viridis_d() +
  ggtitle(paste0("Precision recall curve"))
ggplotly(pr_plot)
library(iml)
# Feature matrix for iml: drop the id and the target.
X_pred <- train_sampled[, .SD, .SDcols = !c("customerid", "good_bad_flag")] %>%
  as.data.frame()

nms_models <- names(model_list)

# Class-probability wrapper for iml (probability of the first class, "Bad").
# FIX: defined once outside the loop — it is loop-invariant since the model
# is passed in as an argument — and its parameters are renamed (the original
# confusingly shadowed the globals chain_rf_a / train_sampled).
pred <- function(model, newdata) {
  results <- predict(model, newdata = newdata, type = "prob")
  results[[1L]]
}

iml_models <- vector("list", length(nms_models))  # preallocate

for (i in seq_along(nms_models)) {  # FIX: seq_along, not 1:length
  iml_models[[i]] <- Predictor$new(model = model_list[[i]],
                                   data = X_pred,
                                   predict.function = pred,
                                   y = train_sampled$good_bad_flag)
}

# Feature Importance ----

# Permutation feature importance (classification-error loss) for each model,
# keeping the top 20 features and plotting point estimates with their
# 5th-95th percentile range.
plots <- list()
for (i in seq_along(nms_models)) {  # FIX: seq_along, not 1:length
  model_this <- iml_models[[i]]
  impa <- FeatureImp$new(model_this, loss = "ce")
  var_importanta <- impa$results %>% data.table()

  #write.csv(var_importanta, file = "var_importanta.csv", row.names = F)
  setorder(var_importanta, -importance)
  var10a <- var_importanta[1:20]  # top 20 features by importance
  # Persist the SVM importances (model 2) for later inspection.
  if (i == 2) write.csv(var10a, file = "svm_var.csv", row.names = FALSE)
  # FIX: removed `width = .3` — it is not a geom_linerange parameter and only
  # produced an "ignoring unknown parameters" warning.
  plots[[i]] <- ggplot(var10a, aes(reorder(feature, importance), importance)) +
    geom_point() +
    ggtitle(nms_models[i]) +
    geom_linerange(aes(ymin = importance.05, ymax = importance.95),
                   position = position_dodge(width = .7)) +
    coord_flip()
}

plots
## [[1]]

## 
## [[2]]

## 
## [[3]]

# SHAP Values ----

# Per-observation Shapley values for the best model (ranger), class "Bad".
nms <- names(model_list)
ids <- which(nms == "ranger")
shap_list <- vector("list", nrow(X_pred))  # preallocate one slot per row
# FIX: removed the unused model_list_shap <- list() binding.
model_this <- iml_models[[ids]]

tic()

for (i in seq_len(nrow(X_pred))) {  # FIX: seq_len, not 1:nrow (safe for 0 rows)
  shap <- Shapley$new(model_this, x.interest = X_pred[i, ], sample.size = 30)
  shap_import <- shap$results %>% data.table()
  shap_import <- shap_import[class == "Bad"]
  # Tag each row with the customer it explains (rows of X_pred and
  # train_sampled are aligned by construction).
  shap_list[[i]] <- shap_import[, customerid := train_sampled[i, customerid]]
}
toc()
## 3085.19 sec elapsed
shap_values <- rbindlist(shap_list, fill = TRUE)

write.csv(shap_values, file = "shap_values.csv", row.names = FALSE)
library(ggforce)
shap_values <- fread("shap_values.csv")

# Rank features by mean absolute Shapley value and keep the top 20.
shap_values[, phi2 := abs(phi)]
shap_imp <- shap_values[, .(Med = median(phi2),
                            Mean = mean(phi2)),
                        by = feature]
setorder(shap_imp, -Mean)
shap_imp <- shap_imp[1:20, ]

# Keep only the top features; order the factor so the plot lists the most
# important feature at the top after coord_flip().
shap_values <- shap_values[feature %in% shap_imp$feature]
shap_values[, feature := factor(feature, levels = rev(shap_imp$feature))]

# SHAP "beeswarm": one point per observation per feature, coloured by the
# variance of the Shapley estimate.
shap_plot <- ggplot(shap_values, aes(feature, phi, color = phi.var)) +
  geom_sina() +
  geom_hline(yintercept = 0) +
  scale_color_gradient(low = "#2187E3", high = "#F32858",
                       breaks = c(0, 1), labels = c("Low", "High")) +
  theme_bw() +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),  # remove axis line
        legend.position = "bottom") +
  coord_flip()
shap_plot